library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Palmer penguins data

The Palmer penguin data records several physical measurements of three penguin species from Palmer Station, Antarctica. Use the code below to load in a tidy version of this data, from bwu62. The .csv file must be in your data folder.

## This assumes that:
### STAT240/data/ contains the data file
### STAT240/lecture/sect04-ggplot/ is your working directory.
### If this gives you "Error: could not find file ... in working directory ...", go to Session > Set Working Directory > To Source File Location, and try again.
### If that doesn't work, then you downloaded one or both files to the wrong place, or they have the wrong name - make sure they don't have a " (1)" or "-1" at the end of their names, which can happen when you download multiple times.

# Read the penguins CSV from the data folder two levels above this one
penguins <- read_csv(file = "../../data/penguins.csv")
## Rows: 333 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): species, island, sex
## dbl (5): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data in the spreadsheet-style viewer
penguins %>% view()

Now explore the data.

# Compact overview: one line per column with its type and first values
penguins %>% glimpse()
## Rows: 333
## Columns: 8
## $ species           <chr> "Adelie", "Adelie", "Adelie", "Adelie", "Adelie", "A…
## $ island            <chr> "Torgersen", "Torgersen", "Torgersen", "Torgersen", …
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
## $ flipper_length_mm <dbl> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18…
## $ body_mass_g       <dbl> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
## $ sex               <chr> "male", "female", "female", "female", "male", "femal…
## $ year              <dbl> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

Grammar of graphics

# An empty canvas: no data, no mapping, no layers
ggplot()

# Data and x/y mapping are declared, but no geom layer has been added yet
ggplot(
  data = penguins,
  mapping = aes(x = body_mass_g, y = flipper_length_mm)
)

To actually build the plot, we need to specify a geom and add it to the plot.

# Points only
ggplot(
  data = penguins,
  mapping = aes(x = body_mass_g, y = flipper_length_mm)
) +
  geom_point()

# Smoother only
ggplot(
  data = penguins,
  mapping = aes(x = body_mass_g, y = flipper_length_mm)
) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Both layers: the smoother is added after the points
ggplot(
  data = penguins,
  mapping = aes(x = body_mass_g, y = flipper_length_mm)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Customization

We can customize several aspects of our plot.

# Constant (non-mapped) aesthetics are passed as arguments of the geom
ggplot(
  data = penguins,
  mapping = aes(x = body_mass_g, y = flipper_length_mm)
) +
  geom_point(color = "red", size = 5)

# Semi-transparent square points, connecting lines, and a linear fit
ggplot(
  data = penguins,
  mapping = aes(x = body_mass_g, y = flipper_length_mm)
) +
  geom_point(alpha = 0.5, shape = 15) +
  geom_line() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

The gray ribbon represents a confidence interval, and can be omitted. We can also use different methods to obtain a line.

# Default smoother with the confidence ribbon turned off
ggplot(
  data = penguins,
  mapping = aes(x = body_mass_g, y = flipper_length_mm)
) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Linear fit with a 99.9% confidence ribbon
ggplot(
  data = penguins,
  mapping = aes(x = body_mass_g, y = flipper_length_mm)
) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, level = .999)
## `geom_smooth()` using formula = 'y ~ x'

1-variable plots

Histograms, density plots, and box plots are useful for visualizing a single variable.

# Histogram with the default binning
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Outline and fill colors are constant aesthetics of the geom
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_histogram(fill = "skyblue1", color = "steelblue4")
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

We can use binwidth, bins, center, and boundary to customize the bins. These two plots have the same output:

# Bins of width 5 with a bin edge placed at 200
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_histogram(
    fill = "skyblue1",
    color = "steelblue4",
    boundary = 200,
    binwidth = 5
  )

# Bins of width 5 with a bin centered at 202.5
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_histogram(
    fill = "skyblue1",
    color = "steelblue4",
    center = 202.5,
    binwidth = 5
  )

Try to get a good balance - not too many and not too few bins!

# Too many bins
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_histogram(fill = "skyblue1", color = "steelblue4", bins = 100)

# Too few bins
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_histogram(fill = "skyblue1", color = "steelblue4", bins = 5)

Density plots are a “smooth” version of a histogram.

# Density plot of flipper length.
# The outline thickness is set with `linewidth`; using `size` for lines was
# deprecated in ggplot2 3.4.0.
ggplot(penguins, aes(x = flipper_length_mm)) +
  geom_density(
    color = "goldenrod3",
    fill = "gold",
    linewidth = 1
  )

# Overlaying both plots
ggplot(penguins, aes(x = flipper_length_mm)) +
  geom_histogram(
    # Rescale bar heights to densities so they share a scale with geom_density
    aes(y = after_stat(density)),
    color = "steelblue4",
    fill = "skyblue1"
  ) +
  geom_density(
    color = "goldenrod3",
    fill = "gold",
    alpha = 0.3, # Transparency... the density plot is "on top"
    linewidth = 2 # `linewidth`, not the deprecated `size`, for line thickness
  )
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Boxplots visualize the quartiles of our data.

# Horizontal boxplot of flipper length with a constant fill
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_boxplot(fill = "skyblue")

Let’s re-create the one-variable plots, but color by species.

# Histogram by species. Note: `color` only affects the bar outlines;
# map `fill` instead to color the bar interiors.
ggplot(penguins, aes(x = flipper_length_mm, color = species)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Overlapping densities, one per species
ggplot(penguins, aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.3)

# Boxplot by species: `fill` is mapped to species so one box is drawn per
# species (a constant fill would draw a single box for all penguins)
ggplot(penguins, aes(x = flipper_length_mm, fill = species)) +
  geom_boxplot(alpha = 0.5)

Bar graphs

Bar graphs depict a single categorical variable, like species. geom_bar() counts the number of instances in a dataset.

# One bar per species; bar height is the number of rows for that species
ggplot(data = penguins, mapping = aes(x = species)) +
  geom_bar()

With geom_col(), you specify the heights of the bars yourself. It is more flexible than geom_bar() but more work to use.

Let’s make a summarized version of the data with the counts and average flipper length of each species.

# We'll cover this code next week!
# One row per species: number of penguins and mean flipper length
penguin_summary <- penguins %>%
  group_by(species) %>%
  summarize(
    n_penguins = n(),
    mean_length = mean(flipper_length_mm)
  )

penguin_summary
## # A tibble: 3 × 3
##   species   n_penguins mean_length
##   <chr>          <int>       <dbl>
## 1 Adelie           146        190.
## 2 Chinstrap         68        196.
## 3 Gentoo           119        217.
# Bar heights come from the precomputed counts, recreating the geom_bar graph
ggplot(data = penguin_summary, mapping = aes(x = species, y = n_penguins)) +
  geom_col()

# Any other summary column can supply the bar heights as well
ggplot(data = penguin_summary, mapping = aes(x = species, y = mean_length)) +
  geom_col()

We can add fill as a constant or variable aesthetic.

# Constant fill and outline color for every bar
ggplot(data = penguins, mapping = aes(x = species)) +
  geom_bar(color = "black", fill = "lightblue")

# Fill mapped to a variable: bars are stacked by sex
ggplot(data = penguins, mapping = aes(x = species, fill = sex)) +
  geom_bar()

# position = "dodge" places bars next to each other as opposed to stacked
ggplot(data = penguins, mapping = aes(x = species, fill = sex)) +
  geom_bar(position = "dodge")

Create a bar-plot showing the count of each penguin species by island. Decide how you want to assign the aesthetics to our two variables of interest.

# Species on the x-axis, island as side-by-side fill colors
ggplot(data = penguins, mapping = aes(x = species, fill = island)) +
  geom_bar(position = "dodge")

Variable aesthetics

Let’s look at a scatterplot of bill depth vs flipper length.

# Scatterplot with a single linear fit and its confidence ribbon.
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
ggplot(penguins, aes(x = bill_depth_mm, y = flipper_length_mm)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE)
## `geom_smooth()` using formula = 'y ~ x'

What if we want to color the points by species?

# Color is a global aesthetic here, so both the points and the fitted lines
# are split by species. Spell out FALSE: `F` can be reassigned.
ggplot(penguins, aes(x = bill_depth_mm, y = flipper_length_mm,
                     col = species)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

What if we want the points to be colored, but keep only a single smooth line? We have to specify the color mapping within geom_point() rather than the original ggplot(). This is called a local aesthetic.

# Color is a local aesthetic of geom_point() only, so geom_smooth() fits a
# single line to all penguins. Spell out FALSE: `F` can be reassigned.
ggplot(penguins, aes(x = bill_depth_mm, y = flipper_length_mm)) +
  geom_point(aes(col = species)) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

What’s wrong with the following plots? How can we fix them?

# Read and interpret the error messages from these two plots
# NOTE(review): as written, both calls are valid and draw the same plot, so
# they appear to be the already-corrected versions of the exercise.
ggplot(
  data = penguins,
  mapping = aes(x = bill_depth_mm, y = flipper_length_mm)
) +
  geom_point(mapping = aes(color = species))

ggplot(
  data = penguins,
  mapping = aes(x = bill_depth_mm, y = flipper_length_mm)
) +
  geom_point(mapping = aes(color = species))

# Why are these points not huge? Why is there a legend for "size"?
# Corrected version: `size` is a constant set outside aes(), and color is
# inherited from the global mapping, so it is not repeated in geom_point().
ggplot(penguins, aes(x = bill_depth_mm, y = flipper_length_mm, color = species)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# The size specification was put inside the aesthetic mapping, while it should have been outside of the aesthetic mapping.

# This just produces a gridded canvas with no points. Why?
# NOTE(review): the call below already ends the ggplot() line with `+`, so it
# appears to be the corrected version and does draw the points.
ggplot(
  data = penguins,
  mapping = aes(x = bill_depth_mm, y = flipper_length_mm, color = species)
) +
  geom_point()

# There was no plus sign after the ggplot line of code

Line plots

Line plots are useful when our data is organized chronologically. Let’s work with a dataset showing U.S. college enrollment data by sex.

# Another way to read data: read_csv() also accepts a URL
enrollment <- read_csv(
  file = "https://bwu62.github.io/stat240-revamp/data/enrollment.csv"
)
## Rows: 148 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): sex
## dbl (2): year, enrolled_millions
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
enrollment %>% glimpse()
## Rows: 148
## Columns: 3
## $ year              <dbl> 1947, 1947, 1948, 1948, 1949, 1949, 1950, 1950, 1951…
## $ sex               <chr> "male", "female", "male", "female", "male", "female"…
## $ enrolled_millions <dbl> 1.659249, 0.678977, 1.709367, 0.694029, 1.721572, 0.…

Here’s the data as a scatterplot.

# One point per year and sex, colored by sex
ggplot(
  data = enrollment,
  mapping = aes(x = year, y = enrolled_millions, color = sex)
) +
  geom_point()

It makes sense to connect consecutive points with lines.

# One line per sex, distinguished by both color and line type
ggplot(
  data = enrollment,
  mapping = aes(
    x = year,
    y = enrolled_millions,
    linetype = sex,
    color = sex
  )
) +
  geom_line()

This is different from a smooth line - there’s no smoothing going on here!

Consider the following plot. What happens when color is changed to be a local aesthetic within geom_point()?

# Color is a global aesthetic, so it applies to both the lines and the points
ggplot(
  data = enrollment,
  mapping = aes(x = year, y = enrolled_millions, color = sex)
) +
  geom_line() +
  geom_point()

More customization

There are countless options for customization in ggplot2. There is a lot of information here - I encourage you to use this section as a reference that you can return to as needed.

Line and text annotations

There are special geometric objects for annotating plots.

# Overall mean flipper length, used to place the vertical reference line
mean_length <- mean(penguins$flipper_length_mm)

# `linewidth` (not the deprecated `size`) controls the thickness of lines
ggplot(penguins, aes(x = flipper_length_mm)) +
  geom_density(
    color = "goldenrod3",
    fill = "gold",
    linewidth = 2
  ) +
  geom_vline(xintercept = mean_length,
             linewidth = 2)

ggplot(penguins, aes(x = flipper_length_mm)) +
  geom_density(
    color = "goldenrod3",
    fill = "gold",
    linewidth = 2
  ) +
  geom_vline(xintercept = mean_length,
             linewidth = 1.5) +
  # annotate() draws one label from constants. geom_text() with constant
  # aesthetics is checked against all 333 data rows and raises a warning.
  annotate("text", x = 215, y = 0.03,
           label = "Mean Flipper Length")

# Inside aes(), `mean_length` refers to the column of penguin_summary (the
# layer's data), giving one vertical line per species
ggplot(penguins, aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.3) +
  geom_vline(data = penguin_summary,
             aes(xintercept = mean_length, col = species))

Axis labels

We should also add plot titles and improved axis labels for clarity. This is essential for any plot that appears in a report and is meant to communicate information to a larger group.

# Density by species with per-species mean lines, plus titles and readable
# axis/legend labels
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.3) +
  geom_vline(
    data = penguin_summary,
    mapping = aes(xintercept = mean_length, col = species)
  ) +
  labs(
    title = "Density of Penguin Flipper Lengths",
    subtitle = "Vertical lines indicate average length",
    caption = "STAT 240",
    x = "Flipper Length (mm)",
    y = "Density",
    # Same legend title for both aesthetics mapped to species
    fill = "Species",
    col = "Species"
  )

Scales allow us more fine-grained control over how aesthetics are displayed. For example, we might want to customize the spacing of the x or y axes.

# Histogram with the default x-axis breaks
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_histogram(
    fill = "skyblue1",
    color = "steelblue4",
    boundary = 70,
    binwidth = 5
  )

# Same histogram with x-axis breaks every 10 units from 170 to 240
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_histogram(
    fill = "skyblue1",
    color = "steelblue4",
    boundary = 70,
    binwidth = 5
  ) +
  scale_x_continuous(breaks = seq(from = 170, to = 240, by = 10))

# Boxplot with the default y-axis limits
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_boxplot()

# Add some extra spacing to the y-axis in the boxplot
ggplot(data = penguins, mapping = aes(x = flipper_length_mm)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(-1, 1))

This also works with discrete scales.

# Dodged bar chart with the default species labels
ggplot(penguins, aes(x = species, fill = sex)) +
  geom_bar(position = "dodge")

# Named vector: each label is matched to its species by name, so the result
# does not depend on the order of the species on the axis
scientific_names <- c(
  "Adelie" = "Pygoscelis adeliae",
  "Chinstrap" = "Pygoscelis antarcticus",
  "Gentoo" = "Pygoscelis papua"
)

# `labels` is spelled out in full rather than relying on partial matching
ggplot(penguins, aes(x = species, fill = sex)) +
  geom_bar(position = "dodge") +
  scale_x_discrete(labels = scientific_names)

Colors

We can also use custom color schemes for the color and fill aesthetics (also can be continuous or discrete). The default color scheme can be difficult to distinguish for certain types of color blindness.

# Default discrete fill palette
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.3)

We can use viridis palettes for either the fill or the color aesthetic.

# Default viridis palette for the discrete fill
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.3) +
  scale_fill_viridis_d()

# The "inferno" variant of the viridis palettes
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.3) +
  scale_fill_viridis_d(option = "inferno")

We can also make our own custom color scheme.

# Hand-picked fill color for each species, matched by name
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.3) +
  scale_fill_manual(
    values = c(
      "Adelie" = "dodgerblue",
      "Chinstrap" = "peachpuff",
      "Gentoo" = "mediumorchid"
    )
  )

Here’s an example of a continuous color scheme.

# Continuous color mapped to flipper length, using the "cividis" palette
ggplot(
  data = penguins,
  mapping = aes(
    x = body_mass_g,
    y = flipper_length_mm,
    color = flipper_length_mm
  )
) +
  geom_point() +
  scale_color_viridis_c(option = "cividis")

# Same plot with a custom two-color gradient
ggplot(
  data = penguins,
  mapping = aes(
    x = body_mass_g,
    y = flipper_length_mm,
    color = flipper_length_mm
  )
) +
  geom_point() +
  scale_color_gradient(high = "dodgerblue", low = "tomato")

Re-create a plot

Let’s try recreating the ESPN plot of football teams.

Set your y aesthetic to reorder(team, points) to re-order the team bars based on the number of points.

# Team names, in the same order as the points-per-game values below
team <- c(
  "Wolves", "Aston Villa", "Liverpool",
  "Tottenham", "Man City"
)
# Two-column tibble: team and points
ESPN <- tibble(
  team = team,
  points = c(2.0, 2.0, 2.2, 2.4, 2.5)
)

ESPN
## # A tibble: 5 × 2
##   team        points
##   <chr>        <dbl>
## 1 Wolves         2  
## 2 Aston Villa    2  
## 3 Liverpool      2.2
## 4 Tottenham      2.4
## 5 Man City       2.5
# Horizontal bars ordered by points; reorder() sorts the teams by `points`.
# The label must be named `title` (a misspelled name is ignored by labs()).
ggplot(ESPN, aes(x = points, y = reorder(team, points), fill = team)) +
  geom_col() +
  labs(title = "Premier League Points Per Game MW 16-20")

Other topics

Faceting

Sometimes, it is easier to view variables as separate graphs rather than as aesthetics. facet_wrap() separates the plot into separate graphs for each level of a categorical variable.

# All species in one panel, distinguished by fill
ggplot(data = penguins, mapping = aes(x = body_mass_g, fill = species)) +
  geom_density(alpha = 0.3)

# One panel per species
ggplot(data = penguins, mapping = aes(x = body_mass_g)) +
  geom_density() +
  facet_wrap(facets = vars(species))

facet_grid() lets us facet based on two variables.

# Grid of panels: one row per species, one column per sex
ggplot(penguins, aes(x = body_mass_g)) +
  geom_density() +
  facet_grid(rows = vars(species), cols = vars(sex))

# This plot is much harder to read:
# (`linewidth`, not the deprecated `size`, sets the outline thickness)
ggplot(penguins, aes(x = body_mass_g, fill = species,
                     linetype = sex)) +
  geom_density(alpha = 0.5, linewidth = 1)

Mathematical functions

geom_function() can be used to plot y = f(x) for any pre-defined R function or for your own mathematical function. It creates a smooth line plot.

For this type of plot, we don’t provide data, but provide all of the necessary information within geom_function().

# Plot y = x^2 + 1 on [-2, 2]
ggplot() +
  geom_function(
    fun = function(x) x^2 + 1,
    n = 1001, # increase number of points used in drawing
    xlim = c(-2, 2)
  )

Here’s an example using a pre-defined function, dnorm(), which creates a normal bell-curve.

# Plot the normal density with mean 10 and sd 2 on [4, 16]
ggplot() +
  geom_function(
    fun = dnorm,
    # `args` are forwarded to dnorm() as its mean and sd arguments
    args = list(mean = 10, sd = 2),
    n = 1001,
    xlim = c(4, 16)
  )

Example: How well do the male Adelie penguin body masses match a theoretical normal distribution?

# Keep only the male Adelie penguins
penguins_small <- filter(penguins, species == "Adelie", sex == "male")

# Sample mean and sd of body mass, used as the parameters of the normal curve
obs_mean <- mean(penguins_small$body_mass_g)
obs_sd <- sd(penguins_small$body_mass_g)


# Observed density (gold) with the fitted normal curve drawn over it
ggplot(data = penguins_small, mapping = aes(x = body_mass_g)) +
  geom_density(lwd = 1.5, color = "goldenrod3") +
  geom_function(
    fun = dnorm,
    n = 1001,
    args = list(mean = obs_mean, sd = obs_sd)
  )